Text Analysis Steps

Some libraries that may be used:

from bs4 import BeautifulSoup as bsoup
import re
import os
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import MWETokenizer
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import reuters
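Several snippets below refer to variables such as tokenized_reuters and all_words that are never defined in these notes. A minimal sketch of how they could be built from the NLTK Reuters corpus (assuming nltk.download('reuters') has been run; the variable names simply mirror the later snippets):

# Sketch (not from the original notes): build a {fileid: token list} dict
# and a flat token list from the Reuters corpus.
tokenizer = RegexpTokenizer(r"\w+")
tokenized_reuters = {fileid: tokenizer.tokenize(reuters.raw(fileid).lower())
                     for fileid in reuters.fileids()}
all_words = list(chain.from_iterable(tokenized_reuters.values()))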

Generate the top 100 bigram collocations:

bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(all_words)
bigram_finder.apply_freq_filter(2)                      # keep bigrams occurring at least twice
bigram_finder.apply_word_filter(lambda w: len(w) < 3)   # drop words shorter than 3 characters
top_100_bigrams = bigram_finder.nbest(bigram_measures.pmi, 100)  # top-100 bigrams by PMI
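MWETokenizer is imported above but never used; one natural follow-up (a sketch, not part of the original notes) is to retokenize the word list so that each of the top-100 collocations is kept as a single token:

# Sketch: treat each top collocation as a single multi-word token,
# e.g. ('new', 'york') -> 'new_york'
mwe_tokenizer = MWETokenizer(top_100_bigrams)
colloc_tokens = mwe_tokenizer.tokenize(all_words)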

Generate the TF-IDF vectors:

from sklearn.feature_extraction.text import TfidfVectorizer

# patent_words / pids are assumed to be the document collection and its matching ids
tfidf_vectorizer = TfidfVectorizer(input = 'content', analyzer = 'word')
tfidf_vectors = tfidf_vectorizer.fit_transform(patent_words)

Alternatively:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer = "word")
tfs = tfidf.fit_transform([' '.join(value) for value in tokenized_reuters.values()])
vocab = tfidf.get_feature_names()   # get_feature_names_out() on newer scikit-learn versions
for word, weight in zip(vocab, tfs.toarray()[0]):
    if weight > 0:
        print(word, ":", weight)

Write the results to a txt file:

save_file = open("patent_student.txt", 'w')
vocab = tfidf_vectorizer.get_feature_names()
cx = tfidf_vectors.tocoo()   # return the coordinate (COO) representation of the sparse matrix
for i, j, v in itertools.zip_longest(cx.row, cx.col, cx.data):
    save_file.write(pids[i] + ',' + vocab[j] + ',' + str(v) + '\n')
save_file.close()
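As a quick sanity check of the second variant, the highest-weighted terms of one document can be listed instead of every non-zero weight. A sketch, assuming the tfs matrix and vocab list defined above:

import numpy as np

# Sketch: the 10 highest-weighted TF-IDF terms of the first document
row = tfs.toarray()[0]
top_idx = np.argsort(row)[::-1][:10]
for idx in top_idx:
    if row[idx] > 0:
        print(vocab[idx], ":", row[idx])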

Most common words

1. Words with the highest counts:

from nltk.probability import *
fd_1 = FreqDist(words)   # words: the full list of tokens (e.g. all_words above)
fd_1.most_common(25)

2. Words appearing in the most documents:

words_2 = list(chain.from_iterable([set(value) for value in tokenized_reuters.values()]))
fd_2 = FreqDist(words_2)
fd_2.most_common(25)

3. Low-frequency words:

fd_3 = FreqDist(words)   # same distribution as fd_1
lessFreqWords = set([k for k, v in fd_3.items() if v < 2])

Alternatively:

lessFreqWords = set(fd_3.hapaxes())

def removeLessFreqWords(fileid):
    return (fileid, [w for w in tokenized_reuters[fileid] if w not in lessFreqWords])
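removeLessFreqWords is defined but never applied; a one-line sketch (not in the original notes) of using it to rebuild tokenized_reuters without the rare words:

# Sketch: rebuild the tokenized corpus with the low-frequency words removed
tokenized_reuters = dict(removeLessFreqWords(fileid) for fileid in tokenized_reuters)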

View where a particular word occurs (concordance):

nltk.Text(reuters.words()).concordance('net')

Creating Count Vectors

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word")
data_features = vectorizer.fit_transform([' '.join(value) for value in tokenized_reuters.values()])
vocab2 = vectorizer.get_feature_names()   # get_feature_names_out() on newer scikit-learn versions
for word, count in zip(vocab2, data_features.toarray()[0]):
    if count > 0:
        print(word, ":", count)
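The loop above only inspects the first document; to get corpus-level counts, a short sketch (assuming data_features and vocab2 from the snippet above) that sums the count matrix over all documents and prints the 25 most frequent terms:

import numpy as np

# Sketch: total count of each vocabulary word across the whole corpus
total_counts = np.asarray(data_features.sum(axis=0)).ravel()
top_idx = np.argsort(total_counts)[::-1][:25]
for idx in top_idx:
    print(vocab2[idx], ":", total_counts[idx])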

Extracting bigrams

The BigramCollocationFinder recipe shown above (top-100 bigrams by PMI) works here as well. Alternatively, raw bigram counts can be taken directly with nltk.util.ngrams:

from nltk.util import ngrams

bigrams = ngrams(reuters.words(), n = 2)
fdbigram = FreqDist(bigrams)
fdbigram.most_common()

The following code finds the best 50 bigrams using their PMI scores (PMI picks out the most strongly associated bigrams):

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(reuters.words())
finder.nbest(bigram_measures.pmi, 50)